"""PDF module"""

from tika import parser

import nltk


def get_pdf_content(pdf_path) -> list:
    """Extract the text of a PDF file and split it into sentences.

    The file is parsed with ``parser.from_file``; line breaks in the
    extracted text are collapsed into single spaces and the result is
    tokenized with ``nltk.sent_tokenize`` using the Russian model.

    Args:
        pdf_path: Location of the PDF; passed unchanged to
            ``parser.from_file``.

    Returns:
        A list of sentence strings. The list is empty when the parser
        yields no text.
    """
    parsed = parser.from_file(pdf_path)
    # The parser may report no text at all (presumably for image-only or
    # empty documents -- confirm against the Tika docs); treat that as
    # "no sentences" rather than failing on None.
    content = parsed.get('content') or ''

    # str methods return new strings, so the cleaned text must be rebound.
    # Lines are joined with a space (not '') so that the last word of one
    # line is not glued to the first word of the next; blank lines and
    # stray '\r' characters are dropped along the way.
    stripped_lines = (line.strip() for line in content.splitlines())
    content = ' '.join(line for line in stripped_lines if line)
    if not content:
        return []
    return nltk.sent_tokenize(content, language='russian')


if __name__ == '__main__':
    import pprint

    # Manual smoke check: print the first ten sentences of a sample file.
    # One-time setup, if the tokenizer data is missing:
    # nltk.download('punkt')
    sentences = get_pdf_content('test.pdf')
    preview = sentences[:10]
    pprint.pprint(preview)
